HealthCareAnalysis_SQL(Python)

Loading...
Copied!
 
Name
Age
Gender
Blood Type
Medical Condition
Date of Admission
Doctor
Hospital
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
Tiffany Ramirez
81
Female
O-
Diabetes
2022-11-17
Patrick Parker
Wallace-Hamilton
Ruben Burns
35
Male
O+
Asthma
2023-06-01
Diane Jackson
Burke, Griffin and Cooper
Chad Byrd
61
Male
B-
Obesity
2019-01-09
Paul Baker
Walton LLC
Antonio Frederick
49
Male
B-
Asthma
2020-05-02
Brian Chandler
Garcia Ltd
Mrs. Brandy Flowers
51
Male
O-
Arthritis
2021-07-09
Dustin Griffin
Jones, Brown and Murray
Patrick Parker
41
Male
AB+
Arthritis
2020-08-20
Robin Green
Boyd PLC
Charles Horton
82
Male
AB+
Hypertension
2021-03-22
Patricia Bishop
Wheeler, Bryant and Johns
Patty Norman
55
Female
O-
Arthritis
2019-05-16
Brian Kennedy
Brown Inc
Ryan Hayes
33
Male
A+
Diabetes
2020-12-17
Kristin Dunn
Smith, Edwards and Obrien
Sharon Perez
39
Female
O-
Asthma
2022-12-15
Jessica Bailey
Brown-Golden
Amy Roberts
45
Male
B-
Cancer
2021-04-13
Anthony Roberts
Little-Spencer
Mrs. Caroline Farrell
23
Female
O-
Hypertension
2019-06-09
William Miller
Rose Inc
Christina Williams
85
Female
A+
Diabetes
2021-11-29
Laura Roberts
Malone, Thompson and Mejia
William Page
72
Female
A+
Diabetes
2021-07-29
James Carney
Richardson-Powell
Michael Bradshaw
65
Female
AB+
Cancer
2021-06-05
Katherine Lowe
Castaneda-Hardy
Brian Dorsey
32
Female
O+
Arthritis
2021-08-07
Curtis Smith
Burch-White
Olivia Gonzalez
64
Male
AB-
Diabetes
2019-11-15
Clayton Mcknight
Cunningham and Sons
10,000 rows
Copied!
 
Name
Age
Gender
Blood Type
Medical Condition
Date of Admission
Doctor
Hospital
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
Tiffany Ramirez
81
Female
O-
Diabetes
2022-11-17
Patrick Parker
Wallace-Hamilton
Ruben Burns
35
Male
O+
Asthma
2023-06-01
Diane Jackson
Burke, Griffin and Cooper
Chad Byrd
61
Male
B-
Obesity
2019-01-09
Paul Baker
Walton LLC
Antonio Frederick
49
Male
B-
Asthma
2020-05-02
Brian Chandler
Garcia Ltd
Mrs. Brandy Flowers
51
Male
O-
Arthritis
2021-07-09
Dustin Griffin
Jones, Brown and Murray
Patrick Parker
41
Male
AB+
Arthritis
2020-08-20
Robin Green
Boyd PLC
Charles Horton
82
Male
AB+
Hypertension
2021-03-22
Patricia Bishop
Wheeler, Bryant and Johns
Patty Norman
55
Female
O-
Arthritis
2019-05-16
Brian Kennedy
Brown Inc
Ryan Hayes
33
Male
A+
Diabetes
2020-12-17
Kristin Dunn
Smith, Edwards and Obrien
Sharon Perez
39
Female
O-
Asthma
2022-12-15
Jessica Bailey
Brown-Golden
Amy Roberts
45
Male
B-
Cancer
2021-04-13
Anthony Roberts
Little-Spencer
Mrs. Caroline Farrell
23
Female
O-
Hypertension
2019-06-09
William Miller
Rose Inc
Christina Williams
85
Female
A+
Diabetes
2021-11-29
Laura Roberts
Malone, Thompson and Mejia
William Page
72
Female
A+
Diabetes
2021-07-29
James Carney
Richardson-Powell
Michael Bradshaw
65
Female
AB+
Cancer
2021-06-05
Katherine Lowe
Castaneda-Hardy
Brian Dorsey
32
Female
O+
Arthritis
2021-08-07
Curtis Smith
Burch-White
Olivia Gonzalez
64
Male
AB-
Diabetes
2019-11-15
Clayton Mcknight
Cunningham and Sons
10,000 rows
all_columns=df.columns
print(all_columns)
['Name', 'Age', 'Gender', 'Blood Type', 'Medical Condition', 'Date of Admission', 'Doctor', 'Hospital', 'Insurance Provider', 'Billing Amount', 'Room Number', 'Admission Type', 'Discharge Date', 'Medication', 'Test Results']
newdf=df.withColumnRenamed("Blood Type","Blood_Type").withColumnRenamed("Medical Condition","Medical_Condition").withColumnRenamed("Date of Admission","Date_Admited").withColumnRenamed("Insurance Provider","Insurance_Provider").withColumnRenamed("Billing Amount","Billing_Amount").withColumnRenamed("Room Number","Room_Number").withColumnRenamed("Admission Type","Admission_Type").withColumnRenamed("Discharge Date","Discharge_Date").withColumnRenamed("Test Results","Test_results")
display(newdf)
Copied!
 
Name
Age
Gender
Blood_Type
Medical_Condition
Date_Admited
Doctor
Hospital
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
Tiffany Ramirez
81
Female
O-
Diabetes
2022-11-17
Patrick Parker
Wallace-Hamilton
Ruben Burns
35
Male
O+
Asthma
2023-06-01
Diane Jackson
Burke, Griffin and Cooper
Chad Byrd
61
Male
B-
Obesity
2019-01-09
Paul Baker
Walton LLC
Antonio Frederick
49
Male
B-
Asthma
2020-05-02
Brian Chandler
Garcia Ltd
Mrs. Brandy Flowers
51
Male
O-
Arthritis
2021-07-09
Dustin Griffin
Jones, Brown and Murray
Patrick Parker
41
Male
AB+
Arthritis
2020-08-20
Robin Green
Boyd PLC
Charles Horton
82
Male
AB+
Hypertension
2021-03-22
Patricia Bishop
Wheeler, Bryant and Johns
Patty Norman
55
Female
O-
Arthritis
2019-05-16
Brian Kennedy
Brown Inc
Ryan Hayes
33
Male
A+
Diabetes
2020-12-17
Kristin Dunn
Smith, Edwards and Obrien
Sharon Perez
39
Female
O-
Asthma
2022-12-15
Jessica Bailey
Brown-Golden
Amy Roberts
45
Male
B-
Cancer
2021-04-13
Anthony Roberts
Little-Spencer
Mrs. Caroline Farrell
23
Female
O-
Hypertension
2019-06-09
William Miller
Rose Inc
Christina Williams
85
Female
A+
Diabetes
2021-11-29
Laura Roberts
Malone, Thompson and Mejia
William Page
72
Female
A+
Diabetes
2021-07-29
James Carney
Richardson-Powell
Michael Bradshaw
65
Female
AB+
Cancer
2021-06-05
Katherine Lowe
Castaneda-Hardy
Brian Dorsey
32
Female
O+
Arthritis
2021-08-07
Curtis Smith
Burch-White
Olivia Gonzalez
64
Male
AB-
Diabetes
2019-11-15
Clayton Mcknight
Cunningham and Sons
10,000 rows
print(newdf.columns)
['Name', 'Age', 'Gender', 'Blood_Type', 'Medical_Condition', 'Date_Admited', 'Doctor', 'Hospital', 'Insurance_Provider', 'Billing_Amount', 'Room_Number', 'Admission_Type', 'Discharge_Date', 'Medication', 'Test_results']
newdf.createOrReplaceTempView("Sample2")
#dataset
display(spark.sql("Select * from sample2"))
Copied!
 
Name
Age
Gender
Blood_Type
Medical_Condition
Date_Admited
Doctor
Hospital
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
Tiffany Ramirez
81
Female
O-
Diabetes
2022-11-17
Patrick Parker
Wallace-Hamilton
Ruben Burns
35
Male
O+
Asthma
2023-06-01
Diane Jackson
Burke, Griffin and Cooper
Chad Byrd
61
Male
B-
Obesity
2019-01-09
Paul Baker
Walton LLC
Antonio Frederick
49
Male
B-
Asthma
2020-05-02
Brian Chandler
Garcia Ltd
Mrs. Brandy Flowers
51
Male
O-
Arthritis
2021-07-09
Dustin Griffin
Jones, Brown and Murray
Patrick Parker
41
Male
AB+
Arthritis
2020-08-20
Robin Green
Boyd PLC
Charles Horton
82
Male
AB+
Hypertension
2021-03-22
Patricia Bishop
Wheeler, Bryant and Johns
Patty Norman
55
Female
O-
Arthritis
2019-05-16
Brian Kennedy
Brown Inc
Ryan Hayes
33
Male
A+
Diabetes
2020-12-17
Kristin Dunn
Smith, Edwards and Obrien
Sharon Perez
39
Female
O-
Asthma
2022-12-15
Jessica Bailey
Brown-Golden
Amy Roberts
45
Male
B-
Cancer
2021-04-13
Anthony Roberts
Little-Spencer
Mrs. Caroline Farrell
23
Female
O-
Hypertension
2019-06-09
William Miller
Rose Inc
Christina Williams
85
Female
A+
Diabetes
2021-11-29
Laura Roberts
Malone, Thompson and Mejia
William Page
72
Female
A+
Diabetes
2021-07-29
James Carney
Richardson-Powell
Michael Bradshaw
65
Female
AB+
Cancer
2021-06-05
Katherine Lowe
Castaneda-Hardy
Brian Dorsey
32
Female
O+
Arthritis
2021-08-07
Curtis Smith
Burch-White
Olivia Gonzalez
64
Male
AB-
Diabetes
2019-11-15
Clayton Mcknight
Cunningham and Sons
10,000 rows
#total paients admited in differnet hospitals
display(spark.sql("select hospital,count(distinct(name)) as total_patient_count from sample2 group by hospital order by total_patient_count desc"))
Copied!
 
hospital
total_patient_count
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
Smith PLC
19
Smith and Sons
17
Smith Inc
14
Smith Ltd
14
Johnson PLC
13
Williams LLC
12
Williams Inc
12
Smith Group
12
Johnson Ltd
11
Johnson Group
11
Thomas Group
11
Brown LLC
10
Miller and Sons
9
Jones LLC
9
Davis and Sons
9
Jones Inc
9
Williams Group
9
8,639 rows
#Gender based average age of developing a particular disease
display(spark.sql("select medical_condition,gender,round(avg(age),0) as avg_age from sample2 group by medical_condition,gender order by medical_condition,gender"))
Copied!
 
medical_condition
gender
avg_age
1
2
3
4
5
6
7
8
9
10
11
12
Arthritis
Female
51
Arthritis
Male
52
Asthma
Female
52
Asthma
Male
50
Cancer
Female
52
Cancer
Male
51
Diabetes
Female
52
Diabetes
Male
52
Hypertension
Female
51
Hypertension
Male
51
Obesity
Female
51
Obesity
Male
52
12 rows
#Different medical conditions
display(spark.sql("select distinct(Medical_condition) from sample2"))
Copied!
 
Medical_condition
1
2
3
4
5
6
Obesity
Diabetes
Arthritis
Hypertension
Cancer
Asthma
6 rows
#Total revenue generated by all the hospitals
display(spark.sql("select round(sum(Billing_Amount),3) as Total_Revenue from sample2"))
Copied!
 
Total_Revenue
1
255168067.777
1 row
#Count of cases based on diseases
display(
    spark.sql("select Medical_condition,count(Medical_condition) as Toatl_Count from sample2 group by Medical_condition order by count(Medical_condition) desc "
    )
)
Copied!
 
Medical_condition
Toatl_Count
1
2
3
4
5
6
Asthma
1708
Cancer
1703
Hypertension
1688
Arthritis
1650
Obesity
1628
Diabetes
1623
6 rows
#Total revenue generated by hospitals
display(spark.sql("select Hospital,round(sum(Billing_Amount),2) from Sample2 group by Hospital order by sum(Billing_Amount) desc limit 5"))
Copied!
 
Hospital
round(sum(Billing_Amount), 2)
1
2
3
4
5
Smith and Sons
477638.88
Smith PLC
432283.55
Smith Ltd
428163.07
Smith Inc
351463.89
Williams LLC
327522.47
5 rows
#Total number of male and female patients
display(spark.sql("select gender, count(*) from sample2 group by gender"))
Copied!
 
gender
count(1)
1
2
Female
5075
Male
4925
2 rows
#Gender based count of diseases
display(spark.sql("select medical_condition,sum(case when gender='Female' then 1 else 0 end) as Female_count,sum(case when gender='Male' then 1 else 0 end) as Male_count from sample2 group by medical_condition"))
Copied!
 
medical_condition
Female_count
Male_count
1
2
3
4
5
6
Obesity
838
790
Diabetes
825
798
Arthritis
815
835
Hypertension
836
852
Cancer
887
816
Asthma
874
834
6 rows
#Average days of treatment for diseases
display(spark.sql("Select medical_condition,round(avg(datediff(discharge_date,date_admited)),0)as avg_days_of_treatment from sample2 group by medical_condition"))
Copied!
 
medical_condition
avg_days_of_treatment
1
2
3
4
5
6
Obesity
15
Diabetes
16
Arthritis
16
Hypertension
15
Cancer
15
Asthma
15
6 rows
#Count of different types of admissions in different hospitals
display(spark.sql("select hospital, sum(case when admission_type='Emergency' then 1 else 0 end)as Emergency,sum(case when admission_type='Elective' then 1 else 0 end)as Elective,sum(case when admission_type='Urgent' then 1 else 0 end)as Urgent from sample2 group by hospital order by emergency desc"))
Copied!
 
hospital
Emergency
Elective
Urgent
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
Smith PLC
8
5
6
Johnson PLC
8
3
2
Smith Inc
6
3
5
Smith Ltd
6
4
4
Williams LLC
6
2
4
Thomas Group
5
5
1
Smith Group
5
3
4
Smith and Sons
5
6
6
Anderson and Sons
5
1
1
Miller and Sons
4
3
2
Johnson Ltd
4
3
4
Thompson PLC
4
0
1
Jones Group
4
3
1
Thomas and Sons
4
1
1
Davis LLC
4
0
1
Harris and Sons
3
3
0
Wright PLC
3
0
0
8,639 rows
#Total insurance amount provided by each provider
display(spark.sql("select insurance_provider,round(sum(billing_amount),3) as total_amount from sample2 group by insurance_provider order by total_amount desc"))
Copied!
 
insurance_provider
total_amount
1
2
3
4
5
Cigna
52340171.592
Aetna
52321794.76
Blue Cross
52125858.902
UnitedHealthcare
50250467.698
Medicare
48129774.825
5 rows
#Insurance provided bases in diseases
display(spark.sql("select insurance_provider,medical_condition,round(sum(billing_amount),3) as total_amount,count(name) as total_patients_claimed from sample2 group by insurance_provider,medical_condition order by insurance_provider,total_amount desc"))
Copied!
 
insurance_provider
medical_condition
total_amount
total_patients_claimed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
Aetna
Cancer
9532886.621
362
Aetna
Hypertension
9194449.164
348
Aetna
Asthma
9112237.882
368
Aetna
Diabetes
8651797.059
324
Aetna
Obesity
8224407.73
315
Aetna
Arthritis
7606016.305
308
Blue Cross
Asthma
9458174.188
368
Blue Cross
Arthritis
9044351.478
348
Blue Cross
Diabetes
9017073.505
339
Blue Cross
Hypertension
8569801.696
322
Blue Cross
Obesity
8331502.421
339
Blue Cross
Cancer
7704955.614
316
Cigna
Cancer
9464660.799
358
Cigna
Asthma
9045429.048
340
Cigna
Arthritis
8791265.989
349
Cigna
Hypertension
8596057.369
356
Cigna
Obesity
8309440.116
318
30 rows
#Count of different cases based on diseases
display(spark.sql("select medical_condition, sum(case when test_results='Normal' then 1 else 0 end)as Normal,sum(case when test_results='Inconclusive' then 1 else 0 end)as Inconclusive,sum(case when test_results='Abnormal' then 1 else 0 end)as abnormal from sample2 group by medical_condition"))
Copied!
 
medical_condition
Normal
Inconclusive
abnormal
1
2
3
4
5
6
Obesity
545
521
562
Diabetes
544
542
537
Arthritis
542
553
555
Hypertension
532
554
602
Cancer
570
556
577
Asthma
534
551
623
6 rows
#Top 3 doctors for each disease 
display(spark.sql("with cte as(select medical_condition,doctor,count(doctor) as cnt from sample2 group by medical_condition,doctor order by medical_condition asc, cnt desc),cte2 as(select *,row_number() over(partition by medical_condition order by cnt desc) as drn from cte) select medical_condition,doctor from cte2 where drn<=3"))
Copied!
 
medical_condition
doctor
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
Arthritis
Patricia Moore
Arthritis
John Skinner
Arthritis
Steven Edwards
Asthma
Michael Smith
Asthma
Michael Johnson
Asthma
Stephanie Jones
Cancer
Aaron Smith
Cancer
Susan Miller
Cancer
Joseph Morales
Diabetes
Matthew Smith
Diabetes
Logan Reynolds
Diabetes
Andrew Adams
Hypertension
David Johnson
Hypertension
Jennifer Miller
Hypertension
Michael Nolan
Obesity
Jennifer Smith
Obesity
Robert Brown
18 rows
# Count of Blood Groups
display(spark.sql("select Blood_type,count(blood_type) as cnt from sample2 group by blood_type order by cnt desc"))
Copied!
 
Blood_type
cnt
1
2
3
4
5
6
7
8
AB-
1275
AB+
1258
B-
1252
O+
1248
O-
1244
B+
1244
A+
1241
A-
1238
8 rows